//%%file merge.cu

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cuda_runtime.h>


//Generates random numbers and assigns them to the array
void fillArray(int* arr, int size) {
	for (int i = 0; i < size; i++) {
		arr[i] = rand() % size;
	}
}


__global__ void mergesort_kernal(int* arr,int *data,int begin,int end, int depth){

    if((end-begin) < 2){
        return;
    }

    int middle = (end+begin)/2;
    int left_index = begin;
    int middle_index = middle;
    int current;
    int n = end-begin;
    cudaStream_t s,s1;


    // Launches a new block to sort the left part.
    cudaStreamCreateWithFlags(&s,cudaStreamNonBlocking);
    mergesort_kernal<<< 1, 1, 0, s >>>(arr,data, begin, middle, depth+1);
    cudaStreamDestroy(s);

    // Launches a new block to sort the right part.
    cudaStreamCreateWithFlags(&s1,cudaStreamNonBlocking);
    mergesort_kernal<<< 1, 1, 0, s1 >>>(arr,data, middle, end, depth+1);
    cudaStreamDestroy(s1);

    // Waits for children's work.
    cudaDeviceSynchronize();

    // Merges children's generated partition.
    for (current = begin; current < end; current++) {
        if (left_index < middle && (middle_index >= end || arr[left_index] <= arr[middle_index])){
            data[current] = arr[left_index];
            left_index++;
        }else{
            data[current] = arr[middle_index];
            middle_index++;
        }
    }

    // Copies from the auxiliary memory to the main memory.
    for(current = begin; current < end; current++){
        arr[current] = data[current];
    }
}


//Host function, also acts as a wrapper
void merge_sort(int* a,int n){
    int* d_a;
    int* d_aux;
    int left = 0;
    int right = n;

    //Sets the device depth, bigger the number bigger the depth limit?
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 18);

    // Allocate GPU memory.
    cudaMalloc((void**)&d_a,n*sizeof(int));
    cudaMalloc((void**)&d_aux,n*sizeof(int));
    cudaMemcpy(d_a,a, n*sizeof(int), cudaMemcpyHostToDevice);

    // Launch on device
    mergesort_kernal<<< 1, 1 >>>(d_a,d_aux, left, right, 0);

    //Sync device
    cudaDeviceSynchronize();

    // Copy back to CPU memory space
    cudaMemcpy(a,d_a, n*sizeof(int), cudaMemcpyDeviceToHost);

    // Free memory on the device
    cudaFree(d_aux);
    cudaFree(d_a);
    cudaDeviceReset();
}


//print the array
void printArray(int* arr, int n){
    for(int i = 0; i < n; i++){
        std::cout << arr[i] << std::endl;
    }
}


int main(int argc, char *argv[]) {
    //Get the size of the array
    int n = std::atoi(argv[1]);
    
    // Create 6 arrays of size n and allocate memory for them
    int *mergeArray = new int[n];

    //Fill the array with randomly generated numbers between 0 & n
    fillArray(mergeArray, n);

    //Sort using bubble
    merge_sort(mergeArray, n);
    
    
    //Check
    //printArray(mergeArray, n);
    std::cout << "Sort is complete!" << std::endl;
    

    //Deallocate the arrays
    delete[] mergeArray;

    return 0;
}